This past June, I did a sentiment analysis of the Kuczaj Corpus ADD LINK HERE from the CHILDES database for my final project in the Data Visualization class taught by Alison Presmanes Hill, Steven Bedrick, and Jackie Wirz. During my presentation some of the questions that came up were: Does the total number of words per transcript vary a lot? How much is this effecting the sentiment analysis? What would happen to the plot if I normalized for transcript length?

The length of the transcripts does vary a great deal, both before and after processing and filtering against the nrc sentiment lexicon. I had actually noticed this artifact in the dataset when working on the project and had plans to address it. I ended up not having enough time to explore different normalization techniques and included a limitations section discussing how this could affect the visualizations I created.

Now, two months later, I am revisiting these questions and am going to find out what will happen if I normalize for transcript length!

Here are the packages that I will be using:

# Loading packages
library(tidyverse)
library(tidytext)
library(forcats)
library(skimr)
library(egg)

I saved the version of the dataset that I used to create the ridgeline density plots as a csv file so I could pick up where I left off.

# reading in data
kuczaj <- read_csv("data/kuczaj_nrc.csv")     

Here’s a glimpse of what kuczaj looks like:

glimpse(kuczaj)
## Observations: 22,893
## Variables: 4
## $ age_months <dbl> 29.00060, 29.00060, 29.00060, 29.00060, 29.00060, 2...
## $ age_years  <dbl> 2.416716, 2.416716, 2.416716, 2.416716, 2.416716, 2...
## $ word       <chr> "hurt", "hurt", "hurt", "hurt", "break", "cry", "cr...
## $ sentiment  <chr> "anger", "fear", "negative", "sadness", "surprise",...

Since I only worked with the “trust”, “joy”, “anticipation”, “sadness”, “fear”, and “anger” sentiments last time, I am going to filter out all other sentiments from the dataframe. I’m also going to coerce sentiment to a factor and will order its levels with the positively associated ones before the negatively associated ones to make plotting easier later on.

# sentiments to keep
sentiment_levels <- c("trust", "joy", "anticipation", 
                      "sadness", "fear", "anger")

# making sentiment a factor
kuczaj <- kuczaj %>% 
  select(-age_years, -word) %>% # removing unneeded columns
  filter(sentiment %in% sentiment_levels) %>% 
  mutate(sentiment = factor(sentiment, levels = sentiment_levels))

Since there are only partial observations for the first and last month, removing them:

kuczaj <- kuczaj %>% 
  filter(age_months >= 29 & age_months <= 60)
kuczaj2 <- kuczaj %>% 
  add_count(age_months, sentiment) %>% 
  rename(n_sentiment = n) %>% 
  distinct(age_months, sentiment, .keep_all = TRUE)


# Adding in count for total tokens per transcript that were kept
kuczaj2 <- kuczaj2 %>% 
  group_by(age_months) %>% 
  mutate(n_tokens = sum(n_sentiment)) %>% 
  ungroup()

skim(kuczaj2)
## Skim summary statistics
##  n obs: 1192 
##  n variables: 4 
## 
## Variable type: factor 
##   variable missing complete    n n_unique
##  sentiment       0     1192 1192        6
##                              top_counts ordered
##  joy: 203, ant: 203, tru: 202, sad: 199   FALSE
## 
## Variable type: integer 
##     variable missing complete    n  mean    sd p0 p25 p50 p75 p100
##  n_sentiment       0     1192 1192 10.82  7.98  1   5   9  14   60
##     n_tokens       0     1192 1192 64.02 30.23  7  45  58  77  174
##      hist
##  ▇▆▂▁▁▁▁▁
##  ▂▆▇▃▂▁▁▁
## 
## Variable type: numeric 
##    variable missing complete    n  mean   sd p0   p25   p50   p75  p100
##  age_months       0     1192 1192 42.72 8.46 29 35.43 42.13 49.17 59.89
##      hist
##  ▇▇▇▇▇▅▃▅
aux <- kuczaj2 %>% 
  tidyr::expand(nesting(age_months, n_tokens), sentiment)

kuczaj2 <- aux %>% 
  left_join(kuczaj2, by = c("age_months", "n_tokens", "sentiment")) %>% 
  replace_na(list(n_sentiment = 0))


kuczaj2 <- kuczaj2 %>% 
  mutate(percent = n_sentiment/n_tokens)

Plot 1

ggplot(kuczaj2, aes(age_months, percent, color = sentiment)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~ sentiment) + 
  labs(x = "Age (months)", y = "Percent") +
  guides(color = FALSE) +
  theme_minimal()

Trying age binned this time

ages_binned <- kuczaj %>% 
  mutate(age_months = floor(age_months)) 

ages_binned <- ages_binned %>% 
  add_count(age_months, sentiment) %>% 
  rename(n_sentiment = n) %>% 
  distinct(age_months, sentiment, .keep_all = TRUE)


# Adding in count for total tokens per transcript that were kept
ages_binned <- ages_binned %>% 
  group_by(age_months) %>% 
  mutate(n_tokens = sum(n_sentiment)) %>% 
  ungroup()

skim(ages_binned)
## Skim summary statistics
##  n obs: 186 
##  n variables: 4 
## 
## Variable type: factor 
##   variable missing complete   n n_unique
##  sentiment       0      186 186        6
##                          top_counts ordered
##  tru: 31, joy: 31, ant: 31, sad: 31   FALSE
## 
## Variable type: integer 
##     variable missing complete   n   mean     sd  p0 p25 p50    p75 p100
##  n_sentiment       0      186 186  69.34  41.95  10  38  61  90.75  234
##     n_tokens       0      186 186 416.06 192.51 124 271 354 614     862
##      hist
##  ▇▇▆▂▂▁▁▁
##  ▃▅▇▂▂▃▂▁
## 
## Variable type: numeric 
##    variable missing complete   n mean   sd p0 p25 p50 p75 p100     hist
##  age_months       0      186 186   44 8.97 29  36  44  52   59 ▇▇▇▇▆▇▇▇
aux2 <- ages_binned %>% 
  tidyr::expand(nesting(age_months, n_tokens), sentiment)

ages_binned2 <- aux2 %>% 
  left_join(ages_binned, by = c("age_months", "n_tokens", "sentiment")) %>% 
  replace_na(list(n_sentiment = 0))


ages_binned2 <- ages_binned2 %>% 
  mutate(percent = n_sentiment/n_tokens)

Adding in a column for positive and negative groups

ages_binned2 <- ages_binned2 %>% 
  mutate(type = ifelse(sentiment %in% c("trust", "joy", "anticipation"), 
                       "positive", "negative"))

Plot 2

ggplot(ages_binned2, aes(age_months, percent, fill = sentiment)) +
  geom_col(alpha = 0.7) +
  facet_wrap(~ sentiment) + 
  labs(x = "Age (months)", y = "Percent", title = "Ages binned") +
  guides(fill = FALSE) +
  theme_minimal()

ggplot(ages_binned2, aes(age_months, percent, fill = sentiment)) +
  geom_col(alpha = 0.7) +
  facet_grid(sentiment ~ .)+
  labs(x = "Age (months)", y = "Percent", title = "Ages binned") +
  guides(fill = FALSE) +
  theme_minimal()

positive_plot <- ages_binned2 %>% 
  filter(type == "positive") %>% 
  ggplot(aes(age_months, percent, fill = sentiment)) +
    geom_col(alpha = 0.7) +
    facet_wrap(~sentiment, ncol = 1) +
    geom_vline(xintercept = c(36, 48), color = "black") +
    geom_vline(xintercept = c(29, 59), color = "black", linetype = "dotted") +
    # changing appearance
    theme_minimal() +
    labs(x = "", y = "") +
    guides(fill = FALSE) +
    scale_fill_manual(values = c("#97B8C7", "#AEC9C3", "#7FCCD3")) +
    scale_y_continuous(breaks = c(0.05, 0.15, 0.25),
                       labels = c("5%", "15%", "25%")) +
    scale_x_continuous(breaks = c(29, 36, 48, 59),
                       labels = c("2.4 yrs", "3 yrs", "4 yrs", "4.9 yrs")) +
    theme(strip.text = element_text(size = 13, face = "italic"),
          axis.text.x = element_text(face = "italic"),
          axis.text.y = element_text(face = "italic"))


negative_plot <- ages_binned2 %>% 
  filter(type == "negative") %>% 
  ggplot(aes(age_months, percent, fill = sentiment)) +
    geom_col(alpha = 0.7) +
    facet_wrap(~sentiment, ncol = 1) +
    geom_vline(xintercept = c(36, 48), color = "black") +
    geom_vline(xintercept = c(29, 59), color = "black", linetype = "dotted") +
    # changing apperance
    theme_minimal() +
    labs(x = "", y = "") +
    guides(fill = FALSE) +
    scale_fill_manual(values = c("#21132B", "#4F406E", "#6C7399")) +
    scale_y_continuous(breaks = c(0.05, 0.15, 0.25),
                       labels = c("5%", "15%", "25%")) +
    scale_x_continuous(breaks = c(29, 36, 48, 59),
                       labels = c("2.4 yrs", "3 yrs", "4 yrs", "4.9 yrs")) +
    theme(strip.text = element_text(size = 15, face = "italic"),
          axis.text.x = element_text(face = "italic"),
          axis.text.y = element_text(face = "italic"))

ggarrange(positive_plot, negative_plot, ncol = 2, nrow = 1)

Same plot but opposite layout

ggplot(ages_binned2, aes(age_months, percent, fill = sentiment)) +
  geom_col(alpha = 0.7) +
  facet_wrap(~ sentiment) + 
  geom_vline(xintercept = c(36, 48), color = "black") +
  geom_vline(xintercept = c(29, 59), color = "black", linetype = "dotted") +
  theme_minimal() +
  labs(x = "", y = "") +
  guides(fill = FALSE) +
  scale_fill_manual(values = c("#97B8C7", "#AEC9C3", "#7FCCD3",
                               "#21132B", "#4F406E", "#6C7399")) +
  scale_y_continuous(breaks = c(0.05, 0.15, 0.25),
                     labels = c("5%", "15%", "25%")) +
  scale_x_continuous(breaks = c(29, 36, 48, 59),
                     labels = c("2.4 yrs", "3 yrs", "4 yrs", "4.9 yrs")) +
  theme(strip.text = element_text(size = 15, face = "bold.italic"),
        axis.text.x = element_text(size = 10, face = "italic"),
        axis.text.y = element_text(size = 10, face = "italic"))

ggplot(ages_binned2, aes(age_months, percent, color = sentiment, linetype = sentiment)) +
  geom_line(size = 1) +
  scale_color_manual(values = c("#21132B", "#4F406E", "#6C7399",
                               "#97B8C7", "#AEC9C3", "#7FCCD3")) +
  theme_minimal()